In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import folium
from pyproj import Transformer
import matplotlib.colors as mcolors
In [ ]:
In [7]:
df.columns.tolist()
Out[7]:
['sample.sampleDateTime', 'sample.samplingPoint.easting', 'sample.samplingPoint.northing', 'sample.sampledMaterialType.label', 'Ammonia(N)', 'BOD ATU', 'Nitrate-N', 'Nitrite-N', 'O Diss %sat', 'Orthophospht', 'Phosphorus-P', 'Temp Water', 'TurbidityNTU', 'pH', 'year', 'month', 'day', 'day_of_week', 'location_cluster', 'Ammonia(N)_rolling7', 'BOD ATU_rolling7', 'Nitrate-N_rolling7', 'Nitrite-N_rolling7', 'O Diss %sat_rolling7', 'Orthophospht_rolling7', 'Phosphorus-P_rolling7', 'Temp Water_rolling7', 'TurbidityNTU_rolling7', 'pH_rolling7', 'Ammonia(N)_lag1', 'Ammonia(N)_lag2', 'BOD ATU_lag1', 'BOD ATU_lag2', 'Nitrate-N_lag1', 'Nitrate-N_lag2', 'Nitrite-N_lag1', 'Nitrite-N_lag2', 'O Diss %sat_lag1', 'O Diss %sat_lag2', 'Orthophospht_lag1', 'Orthophospht_lag2', 'Phosphorus-P_lag1', 'Phosphorus-P_lag2', 'Temp Water_lag1', 'Temp Water_lag2', 'TurbidityNTU_lag1', 'TurbidityNTU_lag2', 'pH_lag1', 'pH_lag2']
In [ ]:
In [9]:
df.head
Out[9]:
<bound method NDFrame.head of sample.sampleDateTime sample.samplingPoint.easting \
0 2020-01-02 08:20:00 512248
1 2020-01-02 11:25:00 512242
2 2020-01-02 11:33:00 479766
3 2020-01-02 12:12:00 271427
4 2020-01-02 12:27:00 473010
... ... ...
226226 2024-12-16 10:56:00 464978
226227 2024-12-16 11:02:00 470000
226228 2024-12-16 11:12:00 464978
226229 2024-12-16 11:18:00 464978
226230 2024-12-18 09:55:00 334600
sample.samplingPoint.northing sample.sampledMaterialType.label \
0 302927 GROUNDWATER
1 302941 GROUNDWATER
2 150096 RIVER / RUNNING SURFACE WATER
3 47251 RIVER / RUNNING SURFACE WATER
4 140000 RIVER / RUNNING SURFACE WATER
... ... ...
226226 447907 RIVER / RUNNING SURFACE WATER
226227 260000 RIVER / RUNNING SURFACE WATER
226228 447907 POND / LAKE / RESERVOIR WATER
226229 447907 POND / LAKE / RESERVOIR WATER
226230 290800 RIVER / RUNNING SURFACE WATER
Ammonia(N) BOD ATU Nitrate-N Nitrite-N O Diss %sat Orthophospht \
0 0.1525 1.7 15.895 0.07885 89.4 0.021
1 0.1525 2.8 15.895 0.07885 89.4 0.021
2 0.1525 2.8 15.895 0.07885 89.4 0.021
3 0.0300 2.8 2.690 0.00510 100.0 0.021
4 0.0300 2.8 2.690 0.00510 88.4 0.021
... ... ... ... ... ... ...
226226 0.0300 2.1 1.290 0.00760 93.4 0.032
226227 0.1525 2.1 1.290 0.00760 91.7 0.032
226228 0.1525 2.1 1.290 0.00760 111.7 0.032
226229 0.1525 2.1 1.290 0.00760 109.4 0.032
226230 0.0300 2.1 1.500 0.00400 99.7 0.013
... Orthophospht_lag1 Orthophospht_lag2 Phosphorus-P_lag1 \
0 ... 0.021 0.021 0.037
1 ... 0.021 0.021 0.037
2 ... 0.021 0.021 0.037
3 ... 0.021 0.021 0.037
4 ... 0.021 0.021 0.037
... ... ... ... ...
226226 ... 0.032 0.032 0.013
226227 ... 0.250 NaN 0.270
226228 ... 0.032 0.032 0.013
226229 ... 0.032 0.032 0.013
226230 ... 0.024 0.026 0.086
Phosphorus-P_lag2 Temp Water_lag1 Temp Water_lag2 \
0 0.037 10.5 10.5
1 0.037 10.5 10.5
2 0.037 10.5 10.5
3 0.037 8.6 8.6
4 0.037 9.9 9.9
... ... ... ...
226226 0.013 8.5 8.0
226227 NaN 14.3 NaN
226228 0.013 6.4 7.1
226229 0.013 7.9 6.4
226230 0.036 1.8 10.2
TurbidityNTU_lag1 TurbidityNTU_lag2 pH_lag1 pH_lag2
0 5.0 5.0 7.36 7.36
1 5.0 5.0 7.36 7.36
2 5.0 5.0 7.36 7.36
3 5.0 5.0 7.60 7.60
4 5.0 5.0 7.74 7.74
... ... ... ... ...
226226 8.5 8.5 7.51 7.27
226227 6.2 NaN 7.00 NaN
226228 8.5 8.5 8.40 8.34
226229 8.5 8.5 8.21 8.40
226230 21.7 3.9 7.39 7.60
[226231 rows x 49 columns]>
In [ ]:
In [5]:
file_path = '/Users/sofianebelbrik/Documents/water pollution/Data/Preprocessed_Data/final_water_quality_df.csv'
df = pd.read_csv(file_path)
In [ ]:
In [10]:
parameters = [
'Ammonia(N)', 'BOD ATU', 'Nitrate-N', 'Nitrite-N',
'O Diss %sat', 'Orthophospht', 'Phosphorus-P',
'Temp Water', 'TurbidityNTU', 'pH'
]
In [ ]:
In [12]:
print(df.columns)
Index(['sample.sampleDateTime', 'sample.samplingPoint.easting',
'sample.samplingPoint.northing', 'sample.sampledMaterialType.label',
'Ammonia(N)', 'BOD ATU', 'Nitrate-N', 'Nitrite-N', 'O Diss %sat',
'Orthophospht', 'Phosphorus-P', 'Temp Water', 'TurbidityNTU', 'pH',
'year', 'month', 'day', 'day_of_week', 'location_cluster',
'Ammonia(N)_rolling7', 'BOD ATU_rolling7', 'Nitrate-N_rolling7',
'Nitrite-N_rolling7', 'O Diss %sat_rolling7', 'Orthophospht_rolling7',
'Phosphorus-P_rolling7', 'Temp Water_rolling7', 'TurbidityNTU_rolling7',
'pH_rolling7', 'Ammonia(N)_lag1', 'Ammonia(N)_lag2', 'BOD ATU_lag1',
'BOD ATU_lag2', 'Nitrate-N_lag1', 'Nitrate-N_lag2', 'Nitrite-N_lag1',
'Nitrite-N_lag2', 'O Diss %sat_lag1', 'O Diss %sat_lag2',
'Orthophospht_lag1', 'Orthophospht_lag2', 'Phosphorus-P_lag1',
'Phosphorus-P_lag2', 'Temp Water_lag1', 'Temp Water_lag2',
'TurbidityNTU_lag1', 'TurbidityNTU_lag2', 'pH_lag1', 'pH_lag2'],
dtype='object')
In [ ]:
In [14]:
df.head()
Out[14]:
| sample.sampleDateTime | sample.samplingPoint.easting | sample.samplingPoint.northing | sample.sampledMaterialType.label | Ammonia(N) | BOD ATU | Nitrate-N | Nitrite-N | O Diss %sat | Orthophospht | ... | Orthophospht_lag1 | Orthophospht_lag2 | Phosphorus-P_lag1 | Phosphorus-P_lag2 | Temp Water_lag1 | Temp Water_lag2 | TurbidityNTU_lag1 | TurbidityNTU_lag2 | pH_lag1 | pH_lag2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-02 08:20:00 | 512248 | 302927 | GROUNDWATER | 0.1525 | 1.7 | 15.895 | 0.07885 | 89.4 | 0.021 | ... | 0.021 | 0.021 | 0.037 | 0.037 | 10.5 | 10.5 | 5.0 | 5.0 | 7.36 | 7.36 |
| 1 | 2020-01-02 11:25:00 | 512242 | 302941 | GROUNDWATER | 0.1525 | 2.8 | 15.895 | 0.07885 | 89.4 | 0.021 | ... | 0.021 | 0.021 | 0.037 | 0.037 | 10.5 | 10.5 | 5.0 | 5.0 | 7.36 | 7.36 |
| 2 | 2020-01-02 11:33:00 | 479766 | 150096 | RIVER / RUNNING SURFACE WATER | 0.1525 | 2.8 | 15.895 | 0.07885 | 89.4 | 0.021 | ... | 0.021 | 0.021 | 0.037 | 0.037 | 10.5 | 10.5 | 5.0 | 5.0 | 7.36 | 7.36 |
| 3 | 2020-01-02 12:12:00 | 271427 | 47251 | RIVER / RUNNING SURFACE WATER | 0.0300 | 2.8 | 2.690 | 0.00510 | 100.0 | 0.021 | ... | 0.021 | 0.021 | 0.037 | 0.037 | 8.6 | 8.6 | 5.0 | 5.0 | 7.60 | 7.60 |
| 4 | 2020-01-02 12:27:00 | 473010 | 140000 | RIVER / RUNNING SURFACE WATER | 0.0300 | 2.8 | 2.690 | 0.00510 | 88.4 | 0.021 | ... | 0.021 | 0.021 | 0.037 | 0.037 | 9.9 | 9.9 | 5.0 | 5.0 | 7.74 | 7.74 |
5 rows × 49 columns
In [ ]:
In [16]:
# Step 1: Combine 'year', 'month', and 'day' into a single datetime column.
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
# Step 2: Sort the DataFrame by date (and optionally by water type) to ensure chronological order.
df = df.sort_values(['date', 'sample.sampledMaterialType.label'])
# Step 3: Group by both 'date' and water type and compute the average for each parameter.
daily_df = df.groupby(['date', 'sample.sampledMaterialType.label'])[parameters].mean().reset_index()
# Step 4: Identify the unique water types.
water_types = daily_df['sample.sampledMaterialType.label'].unique()
# Step 5: For each water type, plot daily trends for each parameter.
for water_type in water_types:
# Subset the data for this water type.
subset = daily_df[daily_df['sample.sampledMaterialType.label'] == water_type]
# Create a grid of subplots (e.g., 5 rows x 2 columns if you have 10 parameters).
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))
axs = axs.flatten() # Flatten the array for easier iteration.
# Loop through each parameter and plot its daily average trend.
for i, param in enumerate(parameters):
axs[i].plot(subset['date'], subset[param], marker='o', linestyle='-', alpha=0.8, label=param)
axs[i].set_title(f'{param} Daily Average - {water_type}')
axs[i].set_xlabel('Date')
axs[i].set_ylabel(param)
axs[i].legend()
axs[i].grid(True)
plt.tight_layout()
plt.show()
In [ ]:
In [18]:
# Step 1: Create a new column 'month_date' that represents the first day of each month.
df['month_date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))
# Step 2: Group by both 'month_date' and water type to calculate the mean for each parameter.
monthly_df = df.groupby(['month_date', 'sample.sampledMaterialType.label'])[parameters].mean().reset_index()
# Sort the monthly_df by month_date (and water type if needed)
monthly_df.sort_values(['month_date', 'sample.sampledMaterialType.label'], inplace=True)
# Step 3: Identify unique water types.
water_types = monthly_df['sample.sampledMaterialType.label'].unique()
# Step 4: For each water type, plot the monthly trends for each parameter.
for water_type in water_types:
# Subset the DataFrame for the current water type.
subset = monthly_df[monthly_df['sample.sampledMaterialType.label'] == water_type]
# Create subplots grid for the parameters (e.g., 5 rows x 2 columns if you have 10 parameters).
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))
axs = axs.flatten() # Flatten the array for easier iteration.
# Loop through each parameter and plot its monthly mean trend.
for i, param in enumerate(parameters):
axs[i].plot(subset['month_date'], subset[param], marker='o', linestyle='-', alpha=0.8, label=param)
axs[i].set_title(f'{param} Monthly Average - {water_type}')
axs[i].set_xlabel('Month')
axs[i].set_ylabel(param)
axs[i].legend()
axs[i].grid(True)
plt.tight_layout()
plt.show()
In [ ]:
In [ ]:
yearly_df = df.groupby('year')[parameters].mean().reset_index()
yearly_df.sort_values('year', inplace=True)
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))
axs = axs.flatten() # Flatten the 2D array for easier iteration
# Loop through each parameter and plot its yearly average values.
for i, param in enumerate(parameters):
axs[i].plot(yearly_df['year'], yearly_df[param], marker='o', linestyle='-', alpha=0.8, label=param)
axs[i].set_title(f'{param} Yearly Values')
axs[i].set_xlabel('Year')
axs[i].set_ylabel(param)
axs[i].legend()
axs[i].grid(True)
plt.tight_layout()
plt.show()
In [ ]:
In [12]:
# Step 1: Create a yearly DataFrame that groups by both year and water type.
yearly_by_water = df.groupby(['year', 'sample.sampledMaterialType.label'])[parameters].mean().reset_index()
# Step 2: Identify unique water types.
water_types = yearly_by_water['sample.sampledMaterialType.label'].unique()
# Step 3: Loop over each water type, compute its correlation matrix, and plot it.
for wt in water_types:
# Subset the data for the current water type.
subset = yearly_by_water[yearly_by_water['sample.sampledMaterialType.label'] == wt]
# Compute the correlation matrix for the water quality parameters.
corr_matrix = subset[parameters].corr()
# Create a heatmap for the correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title(f'Yearly Correlation Matrix for {wt}')
plt.xlabel('Parameters')
plt.ylabel('Parameters')
plt.show()
In [ ]:
In [ ]:
# Example ideal ranges (modify these with proper values for your case)
# Adjusted ideal ranges for mixed water sources
ideal_ranges = {
'Ammonia(N)': {'min': 0.0, 'max': 1.0}, # mg/L (combined groundwater & surface water)
'BOD ATU': {'min': 0.0, 'max': 4.0}, # mg/L (higher than drinking water, but realistic for rivers/lakes)
'Nitrate-N': {'min': 0.0, 'max': 30.0}, # mg/L (accounts for river pollution but stays below extreme levels)
'Nitrite-N': {'min': 0.0, 'max': 0.3}, # mg/L (slightly above drinking water limit to allow for natural variance)
'O Diss %sat': {'min': 50, 'max': 120}, # % (oxygen variation in natural waters)
'Orthophospht': {'min': 0.0, 'max': 2.0}, # mg/L (to account for agricultural runoff in some sources)
'Phosphorus-P': {'min': 0.0, 'max': 0.15}, # mg/L (higher than drinking water limit but within natural ranges)
'Temp Water': {'min': 0.0, 'max': 30.0}, # °C (covers groundwater & river/lake seasonal changes)
'TurbidityNTU': {'min': 0.0, 'max': 20.0}, # NTU (since mixed sources include rivers, which are naturally higher)
'pH': {'min': 6.0, 'max': 9.0} # pH scale (general range covering all sources)
}
# Example weights (summing to 1, or use relative weights)
weights = {
'Ammonia(N)': 0.1,
'BOD ATU': 0.1,
'Nitrate-N': 0.1,
'Nitrite-N': 0.1,
'O Diss %sat': 0.15,
'Orthophospht': 0.1,
'Phosphorus-P': 0.1,
'Temp Water': 0.1,
'TurbidityNTU': 0.1,
'pH': 0.05
}
In [ ]:
In [ ]:
import numpy as np
def calc_parameter_score(value, ideal_range, k=3):
"""
Compute a normalized score for a parameter value given its ideal range.
- If the value is within the ideal range, the score is 1.
- If the value is below the ideal range, the relative deviation is
computed as (min_val - value) / min_val.
- If the value is above the ideal range, the relative deviation is
computed as (value - max_val) / max_val.
The final score is calculated as:
score = exp(-k * deviation)
where k is a decay constant (adjustable) that reflects the gravity of the deviation.
This returns a score between 0 and 1.
"""
min_val = ideal_range['min']
max_val = ideal_range['max']
if min_val <= value <= max_val:
return 1.0
else:
if value < min_val:
deviation = (min_val - value) / (min_val if min_val != 0 else 1)
else: # value > max_val
deviation = (value - max_val) / (max_val if max_val != 0 else 1)
# Exponential decay to penalize larger deviations more severely.
score = np.exp(-k * deviation)
return score
In [ ]:
In [ ]:
def compute_water_quality_score(row, ideal_ranges, weights, k=3):
"""
Compute an overall water quality score for a given row.
For each parameter, calculate a normalized score using calc_parameter_score,
then combine the scores using a weighted average.
Returns a score between 0 and 1.
"""
score_sum = 0
total_weight = 0
for param, crit in ideal_ranges.items():
value = row[param]
param_score = calc_parameter_score(value, crit, k=k)
weight = weights.get(param, 1) # Default weight is 1 if not specified.
score_sum += param_score * weight
total_weight += weight
return score_sum / total_weight if total_weight > 0 else 0
# Apply the function to each row of the DataFrame.
df['water_quality_score'] = df.apply(lambda row: compute_water_quality_score(row, ideal_ranges, weights, k=3), axis=1)
In [ ]:
In [ ]:
df.to_csv('final_water_quality_score_.csv', index=False)
In [ ]:
In [ ]:
# Group by the 'date' column and compute the mean water quality score
daily_avg = df.groupby('date')['water_quality_score'].mean().reset_index()
# Plot the daily average water quality score over time
plt.figure(figsize=(12, 6))
plt.plot(daily_avg['date'], daily_avg['water_quality_score'], marker='o', linestyle='-')
plt.title("Daily Average Water Quality Score Over Time")
plt.xlabel("Date")
plt.ylabel("Water Quality Score")
plt.grid(True)
plt.show()
In [ ]:
In [19]:
import numpy as np
import pandas as pd
# Example: Define ideal ranges for each water quality parameter, separately for each water type.
ideal_ranges_by_water = {
'GROUNDWATER': {
'Ammonia(N)': {'min': 0.0, 'max': 0.5}, # Example values for groundwater
'BOD ATU': {'min': 0.0, 'max': 2.0},
'Nitrate-N': {'min': 0.0, 'max': 10.0},
'Nitrite-N': {'min': 0.0, 'max': 0.1},
'O Diss %sat': {'min': 80, 'max': 100},
'Orthophospht': {'min': 0.0, 'max': 0.5},
'Phosphorus-P': {'min': 0.0, 'max': 0.05},
'Temp Water': {'min': 5.0, 'max': 25.0},
'TurbidityNTU': {'min': 0.0, 'max': 5.0},
'pH': {'min': 6.5, 'max': 8.5}
},
'RIVER / RUNNING SURFACE WATER': {
'Ammonia(N)': {'min': 0.0, 'max': 1.0},
'BOD ATU': {'min': 0.0, 'max': 4.0},
'Nitrate-N': {'min': 0.0, 'max': 30.0},
'Nitrite-N': {'min': 0.0, 'max': 0.3},
'O Diss %sat': {'min': 50, 'max': 120},
'Orthophospht': {'min': 0.0, 'max': 2.0},
'Phosphorus-P': {'min': 0.0, 'max': 0.15},
'Temp Water': {'min': 0.0, 'max': 30.0},
'TurbidityNTU': {'min': 0.0, 'max': 20.0},
'pH': {'min': 6.0, 'max': 9.0}
},
'POND / LAKE / RESERVOIR WATER': {
'Ammonia(N)': {'min': 0.0, 'max': 0.8},
'BOD ATU': {'min': 0.0, 'max': 3.0},
'Nitrate-N': {'min': 0.0, 'max': 20.0},
'Nitrite-N': {'min': 0.0, 'max': 0.2},
'O Diss %sat': {'min': 60, 'max': 110},
'Orthophospht': {'min': 0.0, 'max': 1.5},
'Phosphorus-P': {'min': 0.0, 'max': 0.1},
'Temp Water': {'min': 0.0, 'max': 28.0},
'TurbidityNTU': {'min': 0.0, 'max': 15.0},
'pH': {'min': 6.5, 'max': 9.0}
}
}
# Example parameter weights (these weights indicate relative importance, summing to 1)
weights = {
'Ammonia(N)': 0.1,
'BOD ATU': 0.1,
'Nitrate-N': 0.1,
'Nitrite-N': 0.1,
'O Diss %sat': 0.15,
'Orthophospht': 0.1,
'Phosphorus-P': 0.1,
'Temp Water': 0.1,
'TurbidityNTU': 0.1,
'pH': 0.05
}
# Function to compute the score for a single parameter value based on its ideal range.
def calc_parameter_score(value, ideal_range, k=3):
"""
If value is within the ideal range, score is 1.
If not, compute relative deviation and use an exponential decay function:
score = exp(-k * deviation)
Returns a score between 0 and 1.
"""
min_val = ideal_range['min']
max_val = ideal_range['max']
if min_val <= value <= max_val:
return 1.0
else:
if value < min_val:
deviation = (min_val - value) / (min_val if min_val != 0 else 1)
else: # value > max_val
deviation = (value - max_val) / (max_val if max_val != 0 else 1)
score = np.exp(-k * deviation)
return score
# Function to compute the overall water quality score for a given row,
# using the ideal ranges specific to the water type of that row.
def compute_water_quality_score_by_type(row, ideal_ranges_by_water, weights, k=3):
water_type = row['sample.sampledMaterialType.label']
score_sum = 0
total_weight = 0
# Loop through each parameter in our ideal_ranges.
for param, weight in weights.items():
# Get the ideal range for the parameter for the current water type.
ideal_range = ideal_ranges_by_water[water_type][param]
value = row[param]
param_score = calc_parameter_score(value, ideal_range, k)
score_sum += param_score * weight
total_weight += weight
return score_sum / total_weight if total_weight > 0 else 0
# Apply the function to each row in the DataFrame.
# 'df' should be your DataFrame containing the water quality parameters and water type column.
df['water_quality_score'] = df.apply(lambda row: compute_water_quality_score_by_type(row, ideal_ranges_by_water, weights, k=3), axis=1)
# The resulting 'water_quality_score' column contains a normalized score (between 0 and 1)
# for each sample, where 1 indicates the parameter values are within the ideal ranges,
# and lower scores indicate larger deviations.
In [27]:
df.head(40)
Out[27]:
| sample.sampleDateTime | sample.samplingPoint.easting | sample.samplingPoint.northing | sample.sampledMaterialType.label | Ammonia(N) | BOD ATU | Nitrate-N | Nitrite-N | O Diss %sat | Orthophospht | ... | Orthophospht_lag2 | Phosphorus-P_lag1 | Phosphorus-P_lag2 | Temp Water_lag1 | Temp Water_lag2 | TurbidityNTU_lag1 | TurbidityNTU_lag2 | pH_lag1 | pH_lag2 | water_quality_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-02 08:20:00 | 512248 | 302927 | GROUNDWATER | 0.1525 | 1.7 | 15.895 | 0.07885 | 89.40 | 0.021 | ... | 0.021 | 0.037 | 0.037 | 10.5 | 10.5 | 5.0 | 5.0 | 7.36 | 7.36 | 0.917059 |
| 1 | 2020-01-02 11:25:00 | 512242 | 302941 | GROUNDWATER | 0.1525 | 2.8 | 15.895 | 0.07885 | 89.40 | 0.021 | ... | 0.021 | 0.037 | 0.037 | 10.5 | 10.5 | 5.0 | 5.0 | 7.36 | 7.36 | 0.847178 |
| 2 | 2020-01-02 11:33:00 | 479766 | 150096 | RIVER / RUNNING SURFACE WATER | 0.1525 | 2.8 | 15.895 | 0.07885 | 89.40 | 0.021 | ... | 0.021 | 0.037 | 0.037 | 10.5 | 10.5 | 5.0 | 5.0 | 7.36 | 7.36 | 1.000000 |
| 3 | 2020-01-02 12:12:00 | 271427 | 47251 | RIVER / RUNNING SURFACE WATER | 0.0300 | 2.8 | 2.690 | 0.00510 | 100.00 | 0.021 | ... | 0.021 | 0.037 | 0.037 | 8.6 | 8.6 | 5.0 | 5.0 | 7.60 | 7.60 | 1.000000 |
| 4 | 2020-01-02 12:27:00 | 473010 | 140000 | RIVER / RUNNING SURFACE WATER | 0.0300 | 2.8 | 2.690 | 0.00510 | 88.40 | 0.021 | ... | 0.021 | 0.037 | 0.037 | 9.9 | 9.9 | 5.0 | 5.0 | 7.74 | 7.74 | 1.000000 |
| 5 | 2020-01-02 12:46:00 | 478160 | 137000 | RIVER / RUNNING SURFACE WATER | 0.0300 | 2.8 | 2.690 | 0.00510 | 89.10 | 0.021 | ... | 0.021 | 0.037 | 0.037 | 8.6 | 8.6 | 5.0 | 5.0 | 7.77 | 7.77 | 1.000000 |
| 6 | 2020-01-03 08:50:00 | 512245 | 302928 | GROUNDWATER | 0.0300 | 3.0 | 2.690 | 0.00510 | 89.10 | 0.021 | ... | 0.021 | 0.037 | 0.037 | 8.6 | 8.6 | 5.0 | 5.0 | 7.77 | 7.77 | 0.922313 |
| 7 | 2020-01-03 08:55:00 | 512244 | 302928 | GROUNDWATER | 0.0300 | 1.5 | 2.690 | 0.00510 | 89.10 | 0.021 | ... | 0.021 | 0.037 | 0.037 | 8.6 | 8.6 | 5.0 | 5.0 | 7.77 | 7.77 | 1.000000 |
| 8 | 2020-01-03 11:10:00 | 500000 | 2 | RIVER / RUNNING SURFACE WATER | 0.1525 | 1.3 | 2.690 | 0.00510 | 89.00 | 0.210 | ... | 0.210 | 0.037 | 0.037 | 8.0 | 8.0 | 5.0 | 5.0 | 7.77 | 7.77 | 1.000000 |
| 9 | 2020-01-03 11:15:00 | 604515 | 222393 | POND / LAKE / RESERVOIR WATER | 0.1525 | 1.3 | 2.690 | 0.00510 | 84.20 | 0.210 | ... | 0.210 | 0.037 | 0.037 | 8.5 | 8.5 | 5.0 | 5.0 | 7.28 | 7.28 | 1.000000 |
| 10 | 2020-01-03 11:31:00 | 500000 | 2 | RIVER / RUNNING SURFACE WATER | 0.1525 | 1.2 | 2.690 | 0.00510 | 88.80 | 0.210 | ... | 0.210 | 0.037 | 0.037 | 8.0 | 8.0 | 5.0 | 5.0 | 7.77 | 7.77 | 1.000000 |
| 11 | 2020-01-03 11:39:00 | 244360 | 124690 | POND / LAKE / RESERVOIR WATER | 0.1525 | 1.2 | 2.690 | 0.00510 | 88.80 | 0.210 | ... | 0.210 | 0.063 | 0.063 | 8.0 | 8.0 | 5.0 | 5.0 | 7.28 | 7.28 | 1.000000 |
| 12 | 2020-01-03 11:45:00 | 600000 | 220000 | POND / LAKE / RESERVOIR WATER | 0.1525 | 1.2 | 2.690 | 0.00510 | 74.20 | 0.210 | ... | 0.210 | 0.063 | 0.063 | 7.3 | 7.3 | 5.0 | 5.0 | 7.36 | 7.36 | 1.000000 |
| 13 | 2020-01-03 12:15:00 | 500000 | 4 | RIVER / RUNNING SURFACE WATER | 0.1525 | 5.7 | 2.690 | 0.00510 | 74.20 | 0.345 | ... | 0.345 | 0.063 | 0.063 | 7.3 | 7.3 | 21.7 | 21.7 | 7.36 | 7.36 | 0.905435 |
| 14 | 2020-01-03 12:15:00 | 604495 | 222258 | POND / LAKE / RESERVOIR WATER | 0.1525 | 5.7 | 2.690 | 0.00510 | 72.60 | 0.345 | ... | 0.345 | 0.063 | 0.063 | 7.2 | 7.2 | 21.7 | 21.7 | 7.41 | 7.41 | 0.832905 |
| 15 | 2020-01-03 13:25:00 | 604540 | 222481 | RIVER / RUNNING SURFACE WATER | 0.1525 | 5.7 | 2.690 | 0.00510 | 64.85 | 0.345 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.905435 |
| 16 | 2020-01-03 13:35:00 | 600000 | 220000 | RIVER / RUNNING SURFACE WATER | 0.1525 | 5.7 | 2.690 | 0.00510 | 72.10 | 0.345 | ... | 0.345 | 0.063 | 0.063 | 8.6 | 8.6 | 21.7 | 21.7 | 7.49 | 7.49 | 0.905435 |
| 17 | 2020-01-03 14:05:00 | 484160 | 380980 | RIVER / RUNNING SURFACE WATER | 0.1525 | 1.0 | 2.690 | 0.00510 | 72.70 | 0.010 | ... | 0.010 | 0.063 | 0.063 | 7.6 | 7.6 | 21.7 | 21.7 | 7.51 | 7.51 | 0.977492 |
| 18 | 2020-01-03 15:50:00 | 368500 | 155500 | RIVER / RUNNING SURFACE WATER | 0.1525 | 5.7 | 2.690 | 0.00510 | 72.70 | 0.010 | ... | 0.010 | 0.063 | 0.063 | 7.6 | 7.6 | 21.7 | 21.7 | 7.51 | 7.51 | 0.905435 |
| 19 | 2020-01-03 16:50:00 | 368500 | 155500 | RIVER / RUNNING SURFACE WATER | 0.1525 | 4.5 | 2.690 | 0.07885 | 89.20 | 0.345 | ... | 0.010 | 0.063 | 0.063 | 7.6 | 7.6 | 21.7 | 21.7 | 7.51 | 7.51 | 0.946221 |
| 20 | 2020-01-05 11:10:00 | 560000 | 310000 | RIVER / RUNNING SURFACE WATER | 0.1525 | 5.7 | 2.690 | 0.07885 | 64.85 | 0.345 | ... | 0.345 | 0.063 | 0.063 | 9.9 | 9.9 | 21.7 | 21.7 | 7.07 | 7.07 | 0.905435 |
| 21 | 2020-01-06 07:58:00 | 415855 | 537065 | RIVER / RUNNING SURFACE WATER | 0.0340 | 5.7 | 0.996 | 0.00400 | 99.80 | 0.180 | ... | 0.180 | 0.063 | 0.063 | 6.0 | 6.0 | 21.7 | 21.7 | 8.15 | 8.15 | 0.905435 |
| 22 | 2020-01-06 07:59:00 | 399885 | 630775 | RIVER / RUNNING SURFACE WATER | 0.0150 | 5.7 | 2.490 | 0.00720 | 99.20 | 0.025 | ... | 0.025 | 0.034 | 0.034 | 6.7 | 6.7 | 21.7 | 21.7 | 8.15 | 8.15 | 0.905435 |
| 23 | 2020-01-06 08:03:00 | 516196 | 156369 | RIVER / RUNNING SURFACE WATER | 0.0580 | 5.7 | 7.330 | 0.06900 | 84.60 | 0.160 | ... | 0.160 | 0.034 | 0.034 | 8.3 | 8.3 | 21.7 | 21.7 | 7.38 | 7.38 | 0.905435 |
| 24 | 2020-01-06 08:07:00 | 418707 | 480115 | RIVER / RUNNING SURFACE WATER | 0.0300 | 5.7 | 0.996 | 0.00400 | 101.00 | 0.010 | ... | 0.010 | 0.015 | 0.015 | 7.1 | 7.1 | 21.7 | 21.7 | 7.58 | 7.58 | 0.905435 |
| 25 | 2020-01-06 08:08:00 | 575900 | 147800 | RIVER / RUNNING SURFACE WATER | 0.0920 | 5.7 | 4.950 | 0.04900 | 90.60 | 0.150 | ... | 0.150 | 0.015 | 0.015 | 6.3 | 6.3 | 21.7 | 21.7 | 8.17 | 8.17 | 0.905435 |
| 26 | 2020-01-06 08:10:00 | 506637 | 306926 | RIVER / RUNNING SURFACE WATER | 0.0330 | 1.1 | 9.960 | 0.03900 | 96.70 | 0.086 | ... | 0.086 | 0.015 | 0.015 | 7.8 | 7.8 | 21.7 | 21.7 | 7.76 | 7.76 | 0.977492 |
| 27 | 2020-01-06 08:12:00 | 400072 | 630049 | RIVER / RUNNING SURFACE WATER | 0.0038 | 1.1 | 2.500 | 0.00190 | 99.20 | 0.028 | ... | 0.028 | 0.015 | 0.015 | 6.8 | 6.8 | 21.7 | 21.7 | 7.93 | 7.93 | 0.977492 |
| 28 | 2020-01-06 08:14:00 | 410676 | 537444 | RIVER / RUNNING SURFACE WATER | 0.0690 | 1.1 | 3.870 | 0.02700 | 101.00 | 0.170 | ... | 0.170 | 0.015 | 0.015 | 6.4 | 6.4 | 21.7 | 21.7 | 8.25 | 8.25 | 0.977492 |
| 29 | 2020-01-06 08:16:00 | 516118 | 370210 | RIVER / RUNNING SURFACE WATER | 0.0430 | 1.0 | 3.870 | 0.02700 | 101.00 | 0.075 | ... | 0.075 | 0.015 | 0.015 | 6.4 | 6.4 | 21.7 | 21.7 | 8.25 | 8.25 | 0.977492 |
| 30 | 2020-01-06 08:17:00 | 600109 | 133077 | RIVER / RUNNING SURFACE WATER | 0.1525 | 1.0 | 2.980 | 0.07885 | 66.90 | 0.120 | ... | 0.120 | 0.015 | 0.015 | 7.8 | 7.8 | 21.7 | 21.7 | 7.30 | 7.30 | 0.977492 |
| 31 | 2020-01-06 08:19:00 | 514860 | 157110 | RIVER / RUNNING SURFACE WATER | 0.1525 | 1.0 | 2.980 | 0.07885 | 66.90 | 0.120 | ... | 0.120 | 0.015 | 0.015 | 7.8 | 7.8 | 21.7 | 21.7 | 7.58 | 7.58 | 0.977492 |
| 32 | 2020-01-06 08:28:00 | 573757 | 143465 | RIVER / RUNNING SURFACE WATER | 0.1000 | 1.0 | 3.460 | 0.04200 | 71.80 | 0.150 | ... | 0.150 | 0.015 | 0.015 | 6.6 | 6.6 | 21.7 | 21.7 | 7.75 | 7.75 | 0.977492 |
| 33 | 2020-01-06 08:29:00 | 346800 | 316400 | RIVER / RUNNING SURFACE WATER | 0.0720 | 1.0 | 2.680 | 0.02100 | 95.90 | 0.049 | ... | 0.049 | 0.015 | 0.015 | 7.1 | 7.1 | 6.3 | 6.3 | 7.62 | 7.62 | 1.000000 |
| 34 | 2020-01-06 08:31:00 | 462064 | 122902 | RIVER / RUNNING SURFACE WATER | 0.0300 | 1.0 | 1.700 | 0.00400 | 99.80 | 0.017 | ... | 0.017 | 0.015 | 0.015 | 7.0 | 7.0 | 6.3 | 6.3 | 7.38 | 7.38 | 1.000000 |
| 35 | 2020-01-06 08:31:00 | 419133 | 486001 | RIVER / RUNNING SURFACE WATER | 0.0300 | 1.0 | 1.700 | 0.00400 | 99.80 | 0.017 | ... | 0.017 | 0.015 | 0.015 | 7.0 | 7.0 | 6.3 | 6.3 | 7.98 | 7.98 | 1.000000 |
| 36 | 2020-01-06 08:39:00 | 462152 | 122737 | RIVER / RUNNING SURFACE WATER | 0.0300 | 1.0 | 7.400 | 0.00430 | 86.80 | 0.054 | ... | 0.054 | 0.015 | 0.015 | 9.1 | 9.1 | 6.3 | 6.3 | 7.61 | 7.61 | 1.000000 |
| 37 | 2020-01-06 08:40:00 | 572733 | 143138 | RIVER / RUNNING SURFACE WATER | 0.1300 | 1.0 | 3.360 | 0.03800 | 94.30 | 0.078 | ... | 0.078 | 0.015 | 0.015 | 7.0 | 7.0 | 6.3 | 6.3 | 7.78 | 7.78 | 1.000000 |
| 38 | 2020-01-06 08:43:00 | 416780 | 526584 | RIVER / RUNNING SURFACE WATER | 0.0500 | 1.0 | 2.180 | 0.02200 | 91.70 | 0.059 | ... | 0.059 | 0.015 | 0.015 | 7.2 | 7.2 | 3.6 | 3.6 | 8.16 | 8.16 | 1.000000 |
| 39 | 2020-01-06 08:43:00 | 547878 | 383380 | RIVER / RUNNING SURFACE WATER | 0.1525 | 1.2 | 10.900 | 0.07885 | 86.90 | 0.088 | ... | 0.088 | 0.015 | 0.015 | 5.6 | 5.6 | 3.6 | 3.6 | 7.96 | 7.96 | 1.000000 |
40 rows × 50 columns
In [ ]:
In [ ]:
# Group the DataFrame by 'month_date' and calculate the mean water quality score.
monthly_avg_score = df.groupby('month_date')['water_quality_score'].mean().reset_index()
# Sort the monthly DataFrame by 'month_date' to ensure chronological order.
monthly_avg_score.sort_values('month_date', inplace=True)
# Plot the monthly average water quality score over time.
plt.figure(figsize=(12, 6))
plt.plot(monthly_avg_score['month_date'], monthly_avg_score['water_quality_score'],
marker='o', linestyle='-', alpha=0.8)
plt.title("Monthly Average Water Quality Score Over Time")
plt.xlabel("Month")
plt.ylabel("Water Quality Score")
plt.grid(True)
plt.show()
In [ ]:
In [ ]:
# Group the DataFrame by 'year' and calculate the mean water quality score for each year.
yearly_avg_score = df.groupby('year')['water_quality_score'].mean().reset_index()
# Sort the aggregated DataFrame by 'year' to ensure chronological order.
yearly_avg_score.sort_values('year', inplace=True)
# Plot the yearly average water quality score over time.
plt.figure(figsize=(12, 6))
plt.plot(yearly_avg_score['year'], yearly_avg_score['water_quality_score'],
marker='o', linestyle='-', alpha=0.8)
plt.title("Yearly Average Water Quality Score Over Time")
plt.xlabel("Year")
plt.ylabel("Water Quality Score")
plt.grid(True)
plt.show()
In [ ]:
In [ ]:
# Group the DataFrame by location (Easting and Northing) and compute the mean water quality score.
location_summary = df.groupby(['sample.samplingPoint.easting', 'sample.samplingPoint.northing'])['water_quality_score'].mean().reset_index()
# Now location_summary contains one row per unique location with the average water quality score.
print(location_summary.head())
In [ ]:
In [ ]:
import pandas as pd
import folium
from pyproj import Transformer
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
# --- Step 1: Aggregating the water quality score by location ---
# Group by unique locations (using Easting and Northing) and average the water_quality_score.
location_summary = df.groupby(['sample.samplingPoint.easting', 'sample.samplingPoint.northing'])['water_quality_score'].mean().reset_index()
# --- Step 2: Convert Easting/Northing to Latitude/Longitude ---
transformer = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
def convert_coords(easting, northing):
# transformer.transform returns (longitude, latitude)
lon, lat = transformer.transform(easting, northing)
return lat, lon
# Apply the conversion function to each row of location_summary.
location_summary['lat'], location_summary['lon'] = zip(*location_summary.apply(
lambda row: convert_coords(row['sample.samplingPoint.easting'], row['sample.samplingPoint.northing']),
axis=1
))
# --- Step 3: Create a Color Mapping for the Water Quality Score ---
colormap = plt.get_cmap('RdYlGn')
def score_to_hex(score):
"""
Converts a water quality score (0 to 1) to a hex color.
"""
# The colormap expects a value in the range [0, 1]. A score of 0 will be red, 1 will be green.
rgba = colormap(score) # returns an (R,G,B,A) tuple
return mcolors.rgb2hex(rgba)
# --- Step 4: Create a Folium Map ---
# Center the map on the average location from the aggregated data.
avg_lat = location_summary['lat'].mean()
avg_lon = location_summary['lon'].mean()
m = folium.Map(location=[avg_lat, avg_lon], zoom_start=10)
# --- Step 5: Add Circle Markers for Each Location ---
# Each marker's color reflects the water quality score.
for idx, row in location_summary.iterrows():
color = score_to_hex(row['water_quality_score'])
folium.CircleMarker(
location=[row['lat'], row['lon']],
radius=5,
popup=f"Score: {row['water_quality_score']:.2f}",
color=color,
fill=True,
fill_color=color,
fill_opacity=0.7
).add_to(m)
# --- Step 6: Display the Map ---
# Save the map to an HTML file and display it.
m.save('water_quality_map.html')
m